House Prices - Kaggle Prediction Competition

1. Getting prepared

First of all, I install and load the required packages:

# install.packages("tidyverse")
# install.packages("ggplot2")
# install.packages("plotly")
# install.packages("reshape2")
# install.packages("moments")
# install.packages("dplyr")
# install.packages("e1071")
# install.packages("glmnet")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(moments)
library(dplyr)

Then, I upload the training dataset and call it train_data:

train_data <- read_csv("train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Afterwards, I explore the dataset and check for data completeness.

head(train_data)
summary(train_data)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
glimpse(train_data)
## Rows: 1,460
## Columns: 81
## $ Id            <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage   <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual   <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea    <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1    <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ `1stFlrSF`    <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ `2ndFlrSF`    <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd  <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces    <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt   <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars    <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF    <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ `3SsnPorch`   <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal       <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice     <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

There are 1460 observations and 81 variables including 1 target variable as SalePrice and 80 explainable variables as different house features that may or may not relate the buying decision of a house. Besides, from the first few observations, I realized that there are some variables containing a lot of missing values in which I will investigate further in the next part of the study.

2. Data Exploration

2.1 Target variable

First of all, I started with the most important target variable, SalePrice. I evaluated it further by performing the statistic summary, visualize its distribution through the Histogram and check the skewness and kurtosis of the data distribution.

##Statistics summary of the Sale price variable

summary(train_data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
## create the Histogram to visualize the distribution of SalePrice

p <- ggplot(data= train_data) + aes(x=SalePrice) + geom_histogram() 

fig <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig
##Skewness and Kurtosis of SalePrice distribution

Skew_SP <-skewness(train_data$SalePrice)
print(paste("Skewness: ",Skew_SP))
## [1] "Skewness:  1.88094074603404"
Kurt_SP <- kurtosis(train_data$SalePrice)
print(paste("Kurtosis: ", Kurt_SP))
## [1] "Kurtosis:  9.50981201108944"

From the graph, I can see that the SalePrice variable deviates from the normal distribution and is skewed to the right or has positive skewness.

Since skewness measure a degree of asymmetry of data distribution that deviates from the symmetrical normal distribution. The higher level of skewness can cause misleading result. So the skewness value of 1.9 shows that the data are extremely skewed and data transformation tools like log transformation might be needed to make the skewed data closer to a normal distribution.

Kurtosis measures the heaviness of a distribution’s tails relative to a normal distribution. The value of Kurtosis of 9.5, higher than 3, is called positive Kurtosis and the greater the value, the higher the peak.

2.2 Pre-processing of Training data

Before getting into details of the explainable variables, I pre-process data by removing all of the missing values. Firstly, I evaluated all the missing values in the dataset by creating a summary of the total number and percentage of missing values of each variable and put it in descending order.

##Evaluate the missing data
# Calculate total count of missing values per column
train_data_transformed<-train_data
total <- colSums(is.na(train_data_transformed))
total <- total[order(-total)]

# Calculate percentage of missing values per column
percent <- colSums(is.na(train_data_transformed)) / nrow(train_data_transformed)
percent <- percent[order(-percent)]

# Create a dataframe with total count and percent of missing values
missing_data <- data.frame(Total = total, Percent = percent)

# Display the first 20 rows
head(missing_data, 20)

The PoolQC, MiscFeature, Alley, Fence, FireplaceQu, Lot Frontage are the variables with the most null values, with more than 15%. These factors seem to be not so important in the house buying decision, thus these variables could be removed.

The variables related to Garage including GarageType, GarageYrBlt, GarageFinish, GarageQual and GarageCond have similar percentage of null values. In case of different Basement variables such as BsmtExposure, Bsmt FinType2, BsmtQual, BsmtCond, Bsmt FinType1, they also have quite similar numbers of null values. The reason could be because these variables shows a repetitive in response, therefore we could remove these repeated variables. The variables of Masonry veneer such as MasVnrType and MasVnrArea are non-essential variables and thus also removed completely. Meanwhile, the Electric variable only has one null value, I fill such null value with the mean of the Electric variable.

train_data_transformed <- train_data_transformed[, !(colnames(train_data_transformed) %in% rownames(missing_data)[missing_data$Total > 1])]

library(dplyr)

# Function to calculate the mode
get_mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}

# Replace missing values with mode
train_data_transformed$Electrical <- ifelse(is.na(train_data_transformed$Electrical),
							get_mode(train_data_transformed$Electrical),
							train_data_transformed$Electrical)

#Checking if there is still any null value left
max_col_missing <- max(colSums(is.na(train_data_transformed)))
print(max_col_missing)
## [1] 0

After the removal process, I checked again if there is any missing values available.

2.3 Categorical variables

Then I investigated further into the other 80 explainable variables available. And after understanding the meaning and context of different variables, I make a summary list of features grouping them based on their characteristics and give a preliminary expectation of relation between the explainable variables to the target one in a separate spreadsheet.

Based on my expectations, I selected the most potential features that might have strong relationship with Sale Price, or in other words, the features that are more likely to affect our buying decision of a house. Thus, I ended up with 2 categorical variables and 5 numerical variables among 80 explainable variables.

For the categorical variables, I factorized them into numerical variables to prepare the data for later study.

library(dplyr)

# Define a function to encode categorical variables

encode_categorical <- function(x) {
if (is.factor(x) || is.character(x)) {
levels <- unique(x)
numeric_values <- seq_along(levels)
return(numeric_values[match(x, levels)])
} else {
return(x)
}
}

# Apply the encoding function to the dataframe
train_data_transformed <- train_data_transformed %>% mutate(across(everything(), encode_categorical))

# Print the encoded dataframe
print(train_data_transformed)
## # A tibble: 1,460 × 63
##       Id MSSubClass MSZoning LotArea Street LotShape LandContour Utilities
##    <dbl>      <dbl>    <int>   <dbl>  <int>    <int>       <int>     <int>
##  1     1         60        1    8450      1        1           1         1
##  2     2         20        1    9600      1        1           1         1
##  3     3         60        1   11250      1        2           1         1
##  4     4         70        1    9550      1        2           1         1
##  5     5         60        1   14260      1        2           1         1
##  6     6         50        1   14115      1        2           1         1
##  7     7         20        1   10084      1        1           1         1
##  8     8         60        1   10382      1        2           1         1
##  9     9         50        2    6120      1        1           1         1
## 10    10        190        1    7420      1        1           1         1
## # ℹ 1,450 more rows
## # ℹ 55 more variables: LotConfig <int>, LandSlope <int>, Neighborhood <int>,
## #   Condition1 <int>, Condition2 <int>, BldgType <int>, HouseStyle <int>,
## #   OverallQual <dbl>, OverallCond <dbl>, YearBuilt <dbl>, YearRemodAdd <dbl>,
## #   RoofStyle <int>, RoofMatl <int>, Exterior1st <int>, Exterior2nd <int>,
## #   ExterQual <int>, ExterCond <int>, Foundation <int>, BsmtFinSF1 <dbl>,
## #   BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, TotalBsmtSF <dbl>, Heating <int>, …
glimpse(train_data_transformed)
## Rows: 1,460
## Columns: 63
## $ Id            <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
## $ LotArea       <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ LotShape      <int> 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 1, 1,…
## $ LandContour   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Utilities     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ LotConfig     <int> 1, 2, 1, 3, 2, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 3, 4, 1, 1,…
## $ LandSlope     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Neighborhood  <int> 1, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 1, 12, 9, 12, …
## $ Condition1    <int> 1, 2, 1, 1, 1, 1, 1, 3, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 5,…
## $ Condition2    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ BldgType      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1,…
## $ HouseStyle    <int> 1, 2, 1, 1, 1, 3, 2, 1, 3, 4, 2, 1, 2, 2, 2, 4, 2, 2, 2,…
## $ OverallQual   <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1,…
## $ RoofMatl      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Exterior1st   <int> 1, 2, 1, 3, 1, 1, 1, 4, 5, 2, 4, 6, 4, 1, 2, 3, 3, 2, 1,…
## $ Exterior2nd   <int> 1, 2, 1, 3, 1, 1, 1, 4, 3, 2, 4, 3, 5, 1, 2, 6, 6, 2, 1,…
## $ ExterQual     <int> 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2,…
## $ ExterCond     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Foundation    <int> 1, 2, 1, 3, 1, 4, 1, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 5, 1,…
## $ BsmtFinSF1    <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinSF2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ HeatingQC     <int> 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1,…
## $ CentralAir    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Electrical    <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1,…
## $ `1stFlrSF`    <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ `2ndFlrSF`    <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <int> 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 1,…
## $ TotRmsAbvGrd  <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Fireplaces    <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ GarageCars    <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ PavedDrive    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ WoodDeckSF    <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ `3SsnPorch`   <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal       <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,…
## $ SaleCondition <int> 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,…
## $ SalePrice     <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

After that, based on the previous variable investigation and business expectation, I studied the 2 most potential categorical variables and visualize their distribution through the bar graph and use box plots to visualize the relationship between such variable to the target variable, SalePrice.

p_1<- ggplot(data= train_data) + aes(x=OverallQual) + geom_bar() 

fig_1 <- ggplotly(p_1)
fig_1
ggplot(data = train_data) + aes(x = factor(OverallQual), y = SalePrice) + geom_boxplot()

p_2<- ggplot(data= train_data) + aes(x=YearBuilt) + geom_bar() 

fig_2 <- ggplotly(p_2)
fig_2
ggplot(data = train_data) + aes(x = factor(YearBuilt), y = SalePrice) + geom_boxplot()

For the Overall Quality variable, the bar graph shows the most values counted around the value 5 and 6. There’s only few count that can get really high quality of 10. And the box plot shows that there’s a clear positive relationship between Sale price and Overall Quality. The higher the Overall Quality, the higher the Sale Price.

For the Year Built feature, the bar graph shows that there are more houses built in the 2000s. And there’s a slight relationship between YearBuilt and Sale price. eventhough it’s not as clear as the correlation between Sale Price and Overall Quality. But we can see from the box plot, the newer the house, the higher the Sale Price.

2.4 Numerical variables

From the prior investigation and expectation study, the 5 most potential numerical variables includes GrLivArea as Above ground living area,TotalBsmtSF as Total basement area, LotArea as Lot size, TotRmsAbvGrd as Total rooms above grad, GarageArea as Size of garage. I also visualize their distribution on Histogram and establish the scatter plots to compare the relationship between these explainable variables and the target variable Sale Price.

p_3<- ggplot(data= train_data) + aes(x=GrLivArea) + geom_histogram() 

fig_3 <- ggplotly(p_3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig_3
ggplot(data = train_data) +
aes(x = GrLivArea, y = SalePrice) +
geom_jitter()+ geom_smooth(method="gam", formula = y ~s(x)) +  theme(axis.text.x = element_text(angle = 45)) 

p_4<- ggplot(data= train_data) + aes(x=TotalBsmtSF) + geom_histogram() 

fig_4 <- ggplotly(p_4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig_4
ggplot(data = train_data) +
aes(x = TotalBsmtSF, y = SalePrice) +
geom_jitter()+ geom_smooth(method="gam", formula = y ~s(x))

p_5<- ggplot(data= train_data) + aes(x=LotArea) + geom_histogram() 

fig_5 <- ggplotly(p_5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig_5
ggplot(data = train_data) +
aes(x = LotArea, y = SalePrice) +
geom_jitter()+ geom_smooth(method="gam", formula = y ~s(x)) +  theme(axis.text.x = element_text(angle = 45))

p_6<- ggplot(data= train_data) + aes(x=TotRmsAbvGrd) + geom_bar() 

fig_6 <- ggplotly(p_6)
fig_6
ggplot(data = train_data) +
aes(x = TotRmsAbvGrd, y = SalePrice) +
geom_jitter()+ geom_smooth(method="gam", formula = y ~s(x)) +  theme(axis.text.x = element_text(angle = 45))

p_7<- ggplot(data= train_data) + aes(x=GarageArea) + geom_histogram() 

fig_7 <- ggplotly(p_7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig_7
ggplot(data = train_data) +
aes(x = GarageArea, y = SalePrice) +
geom_jitter()+ geom_smooth(method="gam", formula = y ~s(x)) +  theme(axis.text.x = element_text(angle = 45))

From the scatterplots, it’s clear that Sale price has a linear relationship with both GrLivArea and TotalBsmtSF variable. The more the above ground living area, the higher the Sale price. The same thing applied for the Total Basement Square Feet, the larger the basement area leads to the higher the sale price. The other expected variables show only a slight linear relationship with Sale Price.

2.5 Heatmap of correlation

After the quick data examination above, we study further into detail through the use of heatmap to understand the correlation among variables more deeply.

corrmat <- cor(train_data_transformed)
heatmap(corrmat, col = colorRampPalette(c("blue", "white", "red"))(100))

In order to see more clearly the most potential variables that are correlated with sale price. I established the zoomed heatmap with only 10 explainable variables with highest correlation to Sale Price.

n_var<- 10  # number of variables for heatmap
imp_var <- names(head(sort(cor(train_data_transformed)[,"SalePrice"], decreasing = TRUE),n_var))
cor_imp_var <- cor(train_data_transformed[, imp_var])

library(ggplot2)
library(reshape2)

# Reshape correlation matrix to long format
SalePrice_colormap <- melt(cor_imp_var)
SalePrice_colormap$Var1<- imp_var[SalePrice_colormap$Var1]
SalePrice_colormap$Var2<- imp_var[SalePrice_colormap$Var2]

# Create Colormap using ggplot2
ggplot(SalePrice_colormap, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "", y = "") +
ggtitle("SalePrice Correlation Heatmap") +
theme(plot.title = element_text(hjust = 0.5)) +
coord_fixed() 

From the heatmap above, we can see clearly the 10 variables with highest correlation to Sale price as in the order as followed:

  • Overall Qual
  • GrLivArea
  • GarageCars and Garage Area (these 2 variables are quite similar)
  • TotalBsmtSF
  • 1st FlrSF
  • Full Bath
  • TotRmsAbvGrd (similar to GrLivArea variable)
  • YearBuilt

The result shows quite similar to what my first expectation of the important house features that have high influence to the Sale Price.

Now, we move to the main part of predicting the Sale prices

3. Data modelling

3.1 Pre-processing of Test data

First, I upload the test dataset and simply check the data as performed earlier.

test_data <- read_csv("test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(test_data)
summary(test_data)
##        Id         MSSubClass       MSZoning          LotFrontage    
##  Min.   :1461   Min.   : 20.00   Length:1459        Min.   : 21.00  
##  1st Qu.:1826   1st Qu.: 20.00   Class :character   1st Qu.: 58.00  
##  Median :2190   Median : 50.00   Mode  :character   Median : 67.00  
##  Mean   :2190   Mean   : 57.38                      Mean   : 68.58  
##  3rd Qu.:2554   3rd Qu.: 70.00                      3rd Qu.: 80.00  
##  Max.   :2919   Max.   :190.00                      Max.   :200.00  
##                                                     NA's   :227     
##     LotArea         Street             Alley             LotShape        
##  Min.   : 1470   Length:1459        Length:1459        Length:1459       
##  1st Qu.: 7391   Class :character   Class :character   Class :character  
##  Median : 9399   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9819                                                           
##  3rd Qu.:11518                                                           
##  Max.   :56600                                                           
##                                                                          
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1459        Min.   : 1.000   Min.   :1.000   Min.   :1879  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1953  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.079   Mean   :5.554   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1459        Length:1459        Length:1459       
##  1st Qu.:1963   Class :character   Class :character   Class :character  
##  Median :1992   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 100.7                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1290.0                     
##                                        NA's   :15                         
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 350.5   Mode  :character  
##                                        Mean   : 439.2                     
##                                        3rd Qu.: 753.5                     
##                                        Max.   :4010.0                     
##                                        NA's   :1                          
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF     Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0   Length:1459       
##  1st Qu.:   0.00   1st Qu.: 219.2   1st Qu.: 784   Class :character  
##  Median :   0.00   Median : 460.0   Median : 988   Mode  :character  
##  Mean   :  52.62   Mean   : 554.3   Mean   :1046                     
##  3rd Qu.:   0.00   3rd Qu.: 797.8   3rd Qu.:1305                     
##  Max.   :1526.00   Max.   :2140.0   Max.   :5095                     
##  NA's   :1         NA's   :1        NA's   :1                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF     
##  Length:1459        Length:1459        Length:1459        Min.   : 407.0  
##  Class :character   Class :character   Class :character   1st Qu.: 873.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :1079.0  
##                                                           Mean   :1156.5  
##                                                           3rd Qu.:1382.5  
##                                                           Max.   :5095.0  
##                                                                           
##     2ndFlrSF     LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :   0.000   Min.   : 407   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:   0.000   1st Qu.:1118   1st Qu.:0.0000  
##  Median :   0   Median :   0.000   Median :1432   Median :0.0000  
##  Mean   : 326   Mean   :   3.543   Mean   :1486   Mean   :0.4345  
##  3rd Qu.: 676   3rd Qu.:   0.000   3rd Qu.:1721   3rd Qu.:1.0000  
##  Max.   :1862   Max.   :1064.000   Max.   :5095   Max.   :3.0000  
##                                                   NA's   :2       
##   BsmtHalfBath       FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.0652   Mean   :1.571   Mean   :0.3777   Mean   :2.854  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.0000   Max.   :4.000   Max.   :2.0000   Max.   :6.000  
##  NA's   :2                                                        
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1459        Min.   : 3.000   Length:1459       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.042                      Mean   : 6.385                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :2.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:1459        Length:1459        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1959  
##  Median :0.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5812                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :78    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1459        Min.   :0.000   Min.   :   0.0   Length:1459       
##  Class :character   1st Qu.:1.000   1st Qu.: 318.0   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.766   Mean   : 472.8                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :5.000   Max.   :1488.0                     
##                     NA's   :1       NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF       OpenPorchSF    
##  Length:1459        Length:1459        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 28.00  
##                                        Mean   :  93.17   Mean   : 48.31  
##                                        3rd Qu.: 168.00   3rd Qu.: 72.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  EnclosedPorch       3SsnPorch        ScreenPorch        PoolArea      
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  24.24   Mean   :  1.794   Mean   : 17.06   Mean   :  1.744  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.00   Max.   :360.000   Max.   :576.00   Max.   :800.000  
##                                                                        
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1459        Length:1459        Length:1459        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   58.17  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1459        Length:1459       
##  1st Qu.: 4.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.104   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
## 
glimpse(test_data)
## Rows: 1,459
## Columns: 80
## $ Id            <dbl> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 14…
## $ MSSubClass    <dbl> 20, 20, 60, 60, 120, 60, 20, 60, 20, 20, 120, 160, 160, …
## $ MSZoning      <chr> "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "R…
## $ LotFrontage   <dbl> 80, 81, 74, 78, 43, 75, NA, 63, 85, 70, 26, 21, 21, 24, …
## $ LotArea       <dbl> 11622, 14267, 13830, 9978, 5005, 10000, 7980, 8402, 1017…
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <chr> "Reg", "IR1", "IR1", "IR1", "IR1", "IR1", "IR1", "IR1", …
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "HLS", "Lvl", "Lvl", "Lvl", …
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig     <chr> "Inside", "Corner", "Inside", "Inside", "Inside", "Corne…
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood  <chr> "NAmes", "NAmes", "Gilbert", "Gilbert", "StoneBr", "Gilb…
## $ Condition1    <chr> "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "TwnhsE", "1Fam", "1Fam"…
## $ HouseStyle    <chr> "1Story", "1Story", "2Story", "2Story", "1Story", "2Stor…
## $ OverallQual   <dbl> 5, 6, 5, 6, 8, 6, 6, 6, 7, 4, 7, 6, 5, 6, 7, 9, 8, 9, 8,…
## $ OverallCond   <dbl> 6, 6, 5, 6, 5, 5, 7, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5,…
## $ YearBuilt     <dbl> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, 1990, 19…
## $ YearRemodAdd  <dbl> 1961, 1958, 1998, 1998, 1992, 1994, 2007, 1998, 1990, 19…
## $ RoofStyle     <chr> "Gable", "Hip", "Gable", "Gable", "Gable", "Gable", "Gab…
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st   <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdBoard", "…
## $ Exterior2nd   <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdBoard", "…
## $ MasVnrType    <chr> "None", "BrkFace", "None", "BrkFace", "None", "None", "N…
## $ MasVnrArea    <dbl> 0, 108, 0, 20, 0, 0, 0, 0, 0, 0, 0, 504, 492, 0, 0, 162,…
## $ ExterQual     <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "T…
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "Gd", "TA", "TA", "T…
## $ Foundation    <chr> "CBlock", "CBlock", "PConc", "PConc", "PConc", "PConc", …
## $ BsmtQual      <chr> "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", "Gd", "Gd", "T…
## $ BsmtCond      <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure  <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Gd", "N…
## $ BsmtFinType1  <chr> "Rec", "ALQ", "GLQ", "GLQ", "ALQ", "Unf", "ALQ", "Unf", …
## $ BsmtFinSF1    <dbl> 468, 923, 791, 602, 263, 0, 935, 0, 637, 804, 1051, 156,…
## $ BsmtFinType2  <chr> "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", …
## $ BsmtFinSF2    <dbl> 144, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtUnfSF     <dbl> 270, 406, 137, 324, 1017, 763, 233, 789, 663, 0, 354, 32…
## $ TotalBsmtSF   <dbl> 882, 1329, 928, 926, 1280, 763, 1168, 789, 1300, 882, 14…
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC     <chr> "TA", "TA", "Gd", "Ex", "Ex", "Gd", "Ex", "Gd", "Gd", "T…
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ `1stFlrSF`    <dbl> 896, 1329, 928, 926, 1280, 763, 1187, 789, 1341, 882, 13…
## $ `2ndFlrSF`    <dbl> 0, 0, 701, 678, 0, 892, 0, 676, 0, 0, 0, 504, 567, 601, …
## $ LowQualFinSF  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <dbl> 896, 1329, 1629, 1604, 1280, 1655, 1187, 1465, 1341, 882…
## $ BsmtFullBath  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtHalfBath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2,…
## $ HalfBath      <dbl> 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,…
## $ BedroomAbvGr  <dbl> 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, 3, 3, 3,…
## $ KitchenAbvGr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ KitchenQual   <chr> "TA", "Gd", "TA", "Gd", "Gd", "TA", "TA", "TA", "Gd", "T…
## $ TotRmsAbvGrd  <dbl> 5, 6, 6, 7, 5, 7, 6, 7, 5, 4, 5, 5, 6, 6, 4, 10, 7, 7, 8…
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces    <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,…
## $ FireplaceQu   <chr> NA, NA, "TA", "Gd", NA, "TA", NA, "Gd", "Po", NA, "Fa", …
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd", "Attch…
## $ GarageYrBlt   <dbl> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, 1990, 19…
## $ GarageFinish  <chr> "Unf", "Unf", "Fin", "Fin", "RFn", "Fin", "Fin", "Fin", …
## $ GarageCars    <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 3, 3, 3, 3,…
## $ GarageArea    <dbl> 730, 312, 482, 470, 506, 440, 420, 393, 506, 525, 511, 2…
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF    <dbl> 140, 393, 212, 360, 0, 157, 483, 0, 192, 240, 203, 275, …
## $ OpenPorchSF   <dbl> 0, 36, 34, 36, 82, 84, 21, 75, 0, 0, 68, 0, 0, 0, 30, 13…
## $ EnclosedPorch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `3SsnPorch`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ScreenPorch   <dbl> 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PoolArea      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <chr> "MnPrv", NA, "MnPrv", NA, NA, NA, "GdPrv", NA, NA, "MnPr…
## $ MiscFeature   <chr> NA, "Gar2", NA, NA, NA, NA, "Shed", NA, NA, NA, NA, NA, …
## $ MiscVal       <dbl> 0, 12500, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MoSold        <dbl> 6, 6, 3, 6, 1, 4, 3, 5, 2, 4, 6, 2, 3, 6, 6, 1, 6, 6, 2,…
## $ YrSold        <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 20…
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Normal", "Normal", "Norma…

The test dataset only contains 1459 observations and 80 variables compared to the training dataset as the target variable SalePrice is not available.

Then I pre-process the test data by removing the missing values as before.

##Evaluate the missing data
# Calculate total count of missing values per column
test_data_transformed<-test_data
t_total<- colSums(is.na(test_data_transformed))
t_total<- t_total[order(-t_total)]

# Calculate percentage of missing values per column
t_percent<- colSums(is.na(test_data_transformed)) / nrow(test_data_transformed)
t_percent<- t_percent[order(-t_percent)]

# Create a dataframe with total count and percent of missing values
missing_data_t<- data.frame(Total_test = t_total, Percent_test = t_percent)

# Display the first 20 rows
head(missing_data_t, 40)

I also removed completely the variables with more than 15% of missing data and the non-essential ones. For the remaining variables, i filled the remaining missing values with the mean of such variable if they are numerical or return the mode if the variables are categorical and checked again if there is any missing values left.

test_data_transformed <- test_data_transformed[, !(colnames(test_data_transformed) %in% rownames(missing_data_t)[missing_data_t$Total_test > 4])]


# Replace missing values with mode for the categorical variables or mean for the numerical variables
test_data_transformed <- test_data_transformed %>%
mutate(
MSZoning = ifelse(is.na(MSZoning), as.character(get_mode(MSZoning)), MSZoning),
Utilities = ifelse(is.na(Utilities), as.character(get_mode(Utilities)), Utilities),
BsmtFullBath = ifelse(is.na(BsmtFullBath), mean(BsmtFullBath, na.rm = TRUE), BsmtFullBath),
BsmtHalfBath = ifelse(is.na(BsmtHalfBath), mean(BsmtHalfBath, na.rm = TRUE), BsmtHalfBath),
Functional = ifelse(is.na(Functional), as.character(get_mode(Functional)), Functional),
Exterior1st = ifelse(is.na(Exterior1st), as.character(get_mode(Exterior1st)), Exterior1st),
Exterior2nd = ifelse(is.na(Exterior2nd), as.character(get_mode(Exterior2nd)), Exterior2nd),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), mean(BsmtFinSF1, na.rm = TRUE), BsmtFinSF1),
BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), mean(BsmtFinSF2, na.rm = TRUE), BsmtFinSF2),
BsmtUnfSF = ifelse(is.na(BsmtUnfSF), mean(BsmtUnfSF, na.rm = TRUE), BsmtUnfSF),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), mean(TotalBsmtSF, na.rm = TRUE), TotalBsmtSF),
KitchenQual = ifelse(is.na(KitchenQual), as.character(get_mode(KitchenQual)), KitchenQual),
GarageCars = ifelse(is.na(GarageCars), mean(GarageCars, na.rm = TRUE), GarageCars),
GarageArea = ifelse(is.na(GarageArea), mean(GarageArea, na.rm = TRUE), GarageArea),
SaleType = ifelse(is.na(SaleType), as.character(get_mode(SaleType)), SaleType)
)

#Checking if there is still any null value left
max_col_missing_t<- max(colSums(is.na(test_data_transformed)))
print(max_col_missing_t)
## [1] 0

Then i encoded the categorical variables in test dataset into numerical variables to prepare for data aggregation.

# Apply the encoding function (established earlier) to the dataframe
test_data_transformed <- test_data_transformed %>% mutate(across(everything(), encode_categorical))

# Print the encoded dataframe
print(test_data_transformed)
## # A tibble: 1,459 × 62
##       Id MSSubClass MSZoning LotArea Street LotShape LandContour Utilities
##    <dbl>      <dbl>    <int>   <dbl>  <int>    <int>       <int>     <int>
##  1  1461         20        1   11622      1        1           1         1
##  2  1462         20        2   14267      1        2           1         1
##  3  1463         60        2   13830      1        2           1         1
##  4  1464         60        2    9978      1        2           1         1
##  5  1465        120        2    5005      1        2           2         1
##  6  1466         60        2   10000      1        2           1         1
##  7  1467         20        2    7980      1        2           1         1
##  8  1468         60        2    8402      1        2           1         1
##  9  1469         20        2   10176      1        1           1         1
## 10  1470         20        2    8400      1        1           1         1
## # ℹ 1,449 more rows
## # ℹ 54 more variables: LotConfig <int>, LandSlope <int>, Neighborhood <int>,
## #   Condition1 <int>, Condition2 <int>, BldgType <int>, HouseStyle <int>,
## #   OverallQual <dbl>, OverallCond <dbl>, YearBuilt <dbl>, YearRemodAdd <dbl>,
## #   RoofStyle <int>, RoofMatl <int>, Exterior1st <int>, Exterior2nd <int>,
## #   ExterQual <int>, ExterCond <int>, Foundation <int>, BsmtFinSF1 <dbl>,
## #   BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, TotalBsmtSF <dbl>, Heating <int>, …
glimpse(test_data_transformed)
## Rows: 1,459
## Columns: 62
## $ Id            <dbl> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 14…
## $ MSSubClass    <dbl> 20, 20, 60, 60, 120, 60, 20, 60, 20, 20, 120, 160, 160, …
## $ MSZoning      <int> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 2, 2, 2, 2, 2, 2,…
## $ LotArea       <dbl> 11622, 14267, 13830, 9978, 5005, 10000, 7980, 8402, 1017…
## $ Street        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ LotShape      <int> 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2,…
## $ LandContour   <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Utilities     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ LotConfig     <int> 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 1, 1, 3, 3, 1, 2, 1, 1,…
## $ LandSlope     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Neighborhood  <int> 1, 1, 2, 2, 3, 2, 2, 2, 2, 1, 1, 4, 4, 5, 5, 6, 6, 6, 6,…
## $ Condition1    <int> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,…
## $ Condition2    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ BldgType      <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1,…
## $ HouseStyle    <int> 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1,…
## $ OverallQual   <dbl> 5, 6, 5, 6, 8, 6, 6, 6, 7, 4, 7, 6, 5, 6, 7, 9, 8, 9, 8,…
## $ OverallCond   <dbl> 6, 6, 5, 6, 5, 5, 7, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5,…
## $ YearBuilt     <dbl> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, 1990, 19…
## $ YearRemodAdd  <dbl> 1961, 1958, 1998, 1998, 1992, 1994, 2007, 1998, 1990, 19…
## $ RoofStyle     <int> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,…
## $ RoofMatl      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Exterior1st   <int> 1, 2, 1, 1, 3, 3, 3, 1, 3, 4, 5, 3, 3, 4, 4, 1, 1, 1, 6,…
## $ Exterior2nd   <int> 1, 2, 1, 1, 3, 3, 3, 1, 3, 4, 5, 3, 3, 6, 6, 1, 1, 1, 7,…
## $ ExterQual     <int> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 2, 2, 2,…
## $ ExterCond     <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Foundation    <int> 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ BsmtFinSF1    <dbl> 468, 923, 791, 602, 263, 0, 935, 0, 637, 804, 1051, 156,…
## $ BsmtFinSF2    <dbl> 144, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtUnfSF     <dbl> 270, 406, 137, 324, 1017, 763, 233, 789, 663, 0, 354, 32…
## $ TotalBsmtSF   <dbl> 882, 1329, 928, 926, 1280, 763, 1168, 789, 1300, 882, 14…
## $ Heating       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ HeatingQC     <int> 1, 1, 2, 3, 3, 2, 3, 2, 2, 1, 3, 1, 1, 1, 3, 3, 3, 3, 3,…
## $ CentralAir    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Electrical    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `1stFlrSF`    <dbl> 896, 1329, 928, 926, 1280, 763, 1187, 789, 1341, 882, 13…
## $ `2ndFlrSF`    <dbl> 0, 0, 701, 678, 0, 892, 0, 676, 0, 0, 0, 504, 567, 601, …
## $ LowQualFinSF  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <dbl> 896, 1329, 1629, 1604, 1280, 1655, 1187, 1465, 1341, 882…
## $ BsmtFullBath  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtHalfBath  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2,…
## $ HalfBath      <dbl> 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,…
## $ BedroomAbvGr  <dbl> 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, 3, 3, 3,…
## $ KitchenAbvGr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ KitchenQual   <int> 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 3, 2, 3, 3,…
## $ TotRmsAbvGrd  <dbl> 5, 6, 6, 7, 5, 7, 6, 7, 5, 4, 5, 5, 6, 6, 4, 10, 7, 7, 8…
## $ Functional    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Fireplaces    <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,…
## $ GarageCars    <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 3, 3, 3, 3,…
## $ GarageArea    <dbl> 730, 312, 482, 470, 506, 440, 420, 393, 506, 525, 511, 2…
## $ PavedDrive    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ WoodDeckSF    <dbl> 140, 393, 212, 360, 0, 157, 483, 0, 192, 240, 203, 275, …
## $ OpenPorchSF   <dbl> 0, 36, 34, 36, 82, 84, 21, 75, 0, 0, 68, 0, 0, 0, 30, 13…
## $ EnclosedPorch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `3SsnPorch`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ScreenPorch   <dbl> 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PoolArea      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal       <dbl> 0, 12500, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MoSold        <dbl> 6, 6, 3, 6, 1, 4, 3, 5, 2, 4, 6, 2, 3, 6, 6, 1, 6, 6, 2,…
## $ YrSold        <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 20…
## $ SaleType      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 1,…
## $ SaleCondition <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,…

3.2 Data aggregation

Before data aggregation, I check if the data types are consistent between the training and test dataset. After confirming the data type consistency, i then proceed to concatenate the 2 datasets and name it all_data.

##Check the data type consistency between the train and test dataset
# Get the column names for data type consistency check
selected_columns <- intersect(colnames(train_data_transformed), colnames(test_data_transformed))

# Get the data types of the corresponding columns in each data frame
train_types <- sapply(train_data_transformed[selected_columns], class)
test_types <- sapply(test_data_transformed[selected_columns], class)

# Check for data type consistency
consistent_data_type <- all(train_types == test_types)

# Print the result
if (consistent_data_type) {
cat("Data types are consistent between the two datasets \n")
} else {
cat("Data types are not consistent between the two datasets \n")
cat("Data types in train dataset:\n")
print(train_types)
cat("Data types in test dataset:\n")
print(test_types)
}
## Data types are consistent between the two datasets
## Data aggregation

all_data <- rbind(train_data_transformed[,selected_columns],test_data_transformed[,selected_columns])
all_data <- all_data[, !names(all_data) %in% "Id"]
all_data

3.3 Data transformation

Afterwards, I performed data transformation for the Sale Price variable to address skewness to avoid misleading result and prepare for data modelling. The log transformation proposes the calculations of the natural logarithm for each value in the dataset. The Sale Price data after log transformation is visualized with the histogram.

library("ggplot2")

# Create prices DataFrame
log_prices <- data.frame(price = train_data_transformed$SalePrice, log_price = log1p(train_data_transformed$SalePrice))

# Generate histogram
#ggplot(log_prices, aes(x = log_price)) + geom_histogram()
p_log <- ggplot(data= log_prices) + aes(x=log_price) + geom_histogram() 
fig_p_log <-ggplotly(p_log)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig_p_log

I also selected the numeric features that are extremely skewed to scale them to make them more normal distributed to fit model requirement.

library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness
# Log transform the target variable
train_data_transformed$SalePrice <- log1p(train_data_transformed$SalePrice)

# Log transform skewed numeric features
numeric_features <- names(Filter(is.numeric, all_data))

# Compute skewness
skewed_features <- sapply(train_data_transformed[, numeric_features], function(x) skewness(x))
skewed_features <- names(skewed_features[skewed_features > 0.75])

all_data[, skewed_features] <- log1p(all_data[, skewed_features])

3.4 Ridge Regression Model

Afterwards, I prepared data by creating the training and test feature matrices for modeling. Then, I defined the function for calculating the Root Mean Squared Error (RMSE) for cross-validation purpose. I then use glmnet function to initialize the ridge regression model using the training feature matrix, y target variable and lambda penalty parameter.

Alpha values are used as tuning parameters for testing and saved in a vector called alphas. Cross-validation is then performed by iterating through different alpha values in the alphas vector. A ridge regression model is fitted for each alpha value, the RMSE is calculated using the defined function and the results are stored in crvalid_ridge.

The cross-validation results are then converted to a dataframe with column for alpha and RMSE. The plot was generated based on such dataframe to show the relationship between alphas and RMSE.

The minimum RMSE and the according optimal alpha value are found by filtering the result in the dataframe above.

The final ridge regression model is fit using the optimal alpha value of 0.05. The model is trained on the complete training data and now ready for predicting sale prices on the test data by using X_test as the input feature matrix

##Data preparation: Creating matrices in R for modeling:
X_train <- all_data[1:nrow(train_data_transformed), ]
X_test <- all_data[(nrow(train_data_transformed) + 1):nrow(all_data), ]
y <- train_data_transformed$SalePrice


## Package and Function loading:
# Load the required packages

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-7
library(ggplot2)

# RMSE Calculation Function:
rmse_cv <- function(model) {
rmse <- sqrt(mean((predict(model, newx = as.matrix(X_train)) - y)^2))
return(rmse)
}

# Initialize Ridge model
model_ridge <- glmnet(as.matrix(X_train), y, alpha = 0, lambda = 1)


##Cross-validation and Model Selection:
# Define alpha values- tuning parameters for Ridge model
alphas <- c(0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75)

# Perform cross-validation for different alpha values
crvalid_ridge <- sapply(alphas, function(alpha) {
model <- glmnet(as.matrix(X_train), y, alpha = 0, lambda = alpha)
rmse_cv(model)
})

# Convert the result to a data frame
crvalid_ridge <- data.frame(alpha = alphas, rmse = crvalid_ridge)

# Plot the RMSE values against alpha
ggplot(crvalid_ridge, aes(x = alpha, y = rmse)) +
geom_line() +
labs(title = "Validation", x = "alpha", y = "rmse")

##Model Tuning and Final Model:
# Find the alpha value with the minimum RMSE
min_rmse <- min(crvalid_ridge$rmse)
min_rmse
## [1] 0.1298651
crvalid_ridge %>% filter(rmse == min_rmse)
#Fit the final ridege regrestion model using the optimal alpha value
final_model <- glmnet(as.matrix(X_train), y, alpha = 0, lambda = 0.05)

In order to generate predictions for the test data, the final model is used and X_test is passed as the new feature matrix for prediction.

predictions <- predict(final_model, newx = as.matrix(X_test))
head(predictions)
##            s0
## [1,] 11.80711
## [2,] 12.00854
## [3,] 12.08711
## [4,] 12.16929
## [5,] 12.11982
## [6,] 12.04464
# Create a data frame with the test IDs and predicted sale prices
submission <- data.frame(Id = test_data_transformed$Id, Sale_Price = predictions)

# Change the column label to "Sale Price"
colnames(submission)[2] <- "SalePrice"

# Write the submission data frame to a CSV file
write.csv(submission, file = "House_Prices_submission_WinnPham.csv", row.names = FALSE)